# Required Libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.2
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(naniar)
## Warning: package 'naniar' was built under R version 3.6.3
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(dplyr)
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the life-expectancy CSV into a tibble.
# NOTE(review): the path below is an absolute, machine-specific Downloads
# location rather than a project-relative path — confirm where the file should
# live before running this on another machine.
life_df <- read_csv("C:/Users/e005108/Downloads/datasets_12603_17232_Life Expectancy Data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Status = col_character()
## )
## See spec(...) for full column specifications.
# Restrict the analysis to the observations recorded for the year 2014.
life_df <- filter(life_df, Year == 2014)
head(life_df)
## # A tibble: 6 x 22
## Country Year Status `Life expectanc~ `Adult Mortalit~ `infant deaths` Alcohol
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan~ 2014 Devel~ 59.9 271 64 0.01
## 2 Albania 2014 Devel~ 77.5 8 0 4.51
## 3 Algeria 2014 Devel~ 75.4 11 21 0.01
## 4 Angola 2014 Devel~ 51.7 348 67 8.33
## 5 Antigu~ 2014 Devel~ 76.2 131 0 8.56
## 6 Argent~ 2014 Devel~ 76.2 118 8 7.93
## # ... with 15 more variables: `percentage expenditure` <dbl>, `Hepatitis
## # B` <dbl>, Measles <dbl>, BMI <dbl>, `under-five deaths` <dbl>, Polio <dbl>,
## # `Total expenditure` <dbl>, Diphtheria <dbl>, `HIV/AIDS` <dbl>, GDP <dbl>,
## # Population <dbl>, `thinness 1-19 years` <dbl>, `thinness 5-9 years` <dbl>,
## # `Income composition of resources` <dbl>, Schooling <dbl>
# Replace every space in the column names with an underscore.
# gsub() is vectorised over the whole names vector, so no loop or per-name
# test is needed: names without a space are returned unchanged. Each space
# becomes its own underscore (consecutive spaces are not collapsed) and other
# punctuation such as "-" or "/" is left untouched.
names(life_df) <- gsub(" ", "_", names(life_df), fixed = TRUE)
names(life_df)
## [1] "Country" "Year"
## [3] "Status" "Life_expectancy"
## [5] "Adult_Mortality" "infant_deaths"
## [7] "Alcohol" "percentage_expenditure"
## [9] "Hepatitis_B" "Measles"
## [11] "BMI" "under-five_deaths"
## [13] "Polio" "Total_expenditure"
## [15] "Diphtheria" "HIV/AIDS"
## [17] "GDP" "Population"
## [19] "thinness__1-19_years" "thinness_5-9_years"
## [21] "Income_composition_of_resources" "Schooling"
summary(life_df)
## Country Year Status Life_expectancy
## Length:183 Min. :2014 Length:183 Min. :48.10
## Class :character 1st Qu.:2014 Class :character 1st Qu.:65.60
## Mode :character Median :2014 Mode :character Median :73.60
## Mean :2014 Mean :71.54
## 3rd Qu.:2014 3rd Qu.:76.85
## Max. :2014 Max. :89.00
##
## Adult_Mortality infant_deaths Alcohol percentage_expenditure
## Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.00
## 1st Qu.: 66.0 1st Qu.: 0.00 1st Qu.: 0.010 1st Qu.: 11.06
## Median :135.0 Median : 2.00 Median : 0.320 Median : 151.10
## Mean :148.7 Mean : 24.56 Mean : 3.271 Mean : 1001.91
## 3rd Qu.:216.5 3rd Qu.: 18.00 3rd Qu.: 6.700 3rd Qu.: 703.21
## Max. :522.0 Max. :957.00 Max. :15.190 Max. :19479.91
## NA's :1
## Hepatitis_B Measles BMI under-five_deaths
## Min. : 2.00 Min. : 0 Min. : 2.00 Min. : 0.00
## 1st Qu.:79.00 1st Qu.: 0 1st Qu.:23.20 1st Qu.: 0.00
## Median :93.00 Median : 13 Median :47.40 Median : 3.00
## Mean :83.12 Mean : 1831 Mean :41.03 Mean : 32.89
## 3rd Qu.:97.00 3rd Qu.: 316 3rd Qu.:59.80 3rd Qu.: 22.00
## Max. :99.00 Max. :79563 Max. :77.10 Max. :1200.00
## NA's :10 NA's :2
## Polio Total_expenditure Diphtheria HIV/AIDS
## Min. : 8.00 Min. : 1.210 Min. : 2.00 Min. :0.100
## 1st Qu.:80.00 1st Qu.: 4.480 1st Qu.:83.00 1st Qu.:0.100
## Median :94.00 Median : 5.840 Median :94.00 Median :0.100
## Mean :84.73 Mean : 6.201 Mean :84.08 Mean :0.682
## 3rd Qu.:97.00 3rd Qu.: 7.740 3rd Qu.:97.00 3rd Qu.:0.400
## Max. :99.00 Max. :17.140 Max. :99.00 Max. :9.400
## NA's :2
## GDP Population thinness__1-19_years
## Min. : 12.28 Min. :4.100e+01 Min. : 0.100
## 1st Qu.: 617.99 1st Qu.:2.869e+05 1st Qu.: 1.500
## Median : 3154.51 Median :1.568e+06 Median : 3.300
## Mean : 10015.57 Mean :2.106e+07 Mean : 4.533
## 3rd Qu.: 8239.95 3rd Qu.:8.080e+06 3rd Qu.: 6.600
## Max. :119172.74 Max. :1.294e+09 Max. :26.800
## NA's :28 NA's :41 NA's :2
## thinness_5-9_years Income_composition_of_resources Schooling
## Min. : 0.100 Min. :0.3450 Min. : 4.90
## 1st Qu.: 1.500 1st Qu.:0.5700 1st Qu.:10.80
## Median : 3.400 Median :0.7220 Median :13.00
## Mean : 4.676 Mean :0.6884 Mean :12.89
## 3rd Qu.: 6.600 3rd Qu.:0.7960 3rd Qu.:14.90
## Max. :27.400 Max. :0.9450 Max. :20.40
## NA's :2 NA's :10 NA's :10
# Visualise how many values are missing in each variable.
gg_miss_var(life_df)

# Horizontal bar chart of life expectancy per country for one development
# status, with the countries ordered by life expectancy.
#
# data          Data frame with Country, Life_expectancy and Status columns.
# status_label  Value of Status to keep (e.g. "Developed").
# label_size    Font size used for the country labels.
#
# Returns the ggplot object; it is drawn when printed at top level.
plot_life_expectancy_by_status <- function(data, status_label, label_size) {
  data %>%
    filter(.data$Status == status_label) %>%
    ggplot() +
    geom_col(
      aes(
        x = reorder(Country, Life_expectancy),
        y = Life_expectancy,
        fill = Status
      )
    ) +
    coord_flip() +
    theme(axis.text.y = element_text(size = label_size))
}

plot_life_expectancy_by_status(life_df, "Developed", label_size = 6)

# Smaller labels here — presumably because this group contains many more
# countries and larger labels would overlap.
plot_life_expectancy_by_status(life_df, "Developing", label_size = 2)
# Correlation plot of the numeric variables; the first three columns
# (Country, Year, Status) are excluded.
# life_df still contains missing values at this point, and by default cor()
# returns NA for every pair that involves an incomplete column. Compute each
# correlation from the rows where both variables are observed instead.
corrplot(
  cor(life_df[, -(1:3)], use = "pairwise.complete.obs"),
  type = "upper"
)
# Keep only the rows that have a value for every variable.
life <- drop_na(life_df)
summary(life)
## Country Year Status Life_expectancy
## Length:131 Min. :2014 Length:131 Min. :48.10
## Class :character 1st Qu.:2014 Class :character 1st Qu.:64.65
## Mode :character Median :2014 Mode :character Median :72.00
## Mean :2014 Mean :70.52
## 3rd Qu.:2014 3rd Qu.:75.80
## Max. :2014 Max. :89.00
## Adult_Mortality infant_deaths Alcohol percentage_expenditure
## Min. : 2.0 Min. : 0.00 Min. : 0.010 Min. : 0.443
## 1st Qu.: 74.5 1st Qu.: 0.00 1st Qu.: 0.010 1st Qu.: 48.311
## Median :144.0 Median : 3.00 Median : 0.010 Median : 198.734
## Mean :160.4 Mean : 28.56 Mean : 3.061 Mean : 850.874
## 3rd Qu.:225.0 3rd Qu.: 20.00 3rd Qu.: 6.305 3rd Qu.: 718.324
## Max. :522.0 Max. :957.00 Max. :15.190 Max. :16255.162
## Hepatitis_B Measles BMI under-five_deaths
## Min. : 2.00 Min. : 0.0 Min. : 2.00 Min. : 0.00
## 1st Qu.:78.00 1st Qu.: 0.0 1st Qu.:22.85 1st Qu.: 1.00
## Median :91.00 Median : 10.0 Median :45.90 Median : 3.00
## Mean :81.71 Mean : 2042.9 Mean :40.48 Mean : 38.24
## 3rd Qu.:96.00 3rd Qu.: 289.5 3rd Qu.:59.45 3rd Qu.: 25.50
## Max. :99.00 Max. :79563.0 Max. :77.10 Max. :1200.00
## Polio Total_expenditure Diphtheria HIV/AIDS
## Min. : 8.0 Min. : 1.210 Min. : 2.00 Min. :0.1000
## 1st Qu.:78.0 1st Qu.: 4.485 1st Qu.:80.00 1st Qu.:0.1000
## Median :92.0 Median : 5.820 Median :92.00 Median :0.1000
## Mean :83.5 Mean : 6.107 Mean :83.89 Mean :0.8099
## 3rd Qu.:97.0 3rd Qu.: 7.630 3rd Qu.:97.00 3rd Qu.:0.5000
## Max. :99.0 Max. :13.730 Max. :99.00 Max. :9.4000
## GDP Population thinness__1-19_years
## Min. : 12.28 Min. :4.100e+01 Min. : 0.100
## 1st Qu.: 554.92 1st Qu.:2.876e+05 1st Qu.: 1.500
## Median : 2522.80 Median :1.563e+06 Median : 3.300
## Mean : 7256.85 Mean :2.227e+07 Mean : 4.648
## 3rd Qu.: 7438.05 3rd Qu.:8.059e+06 3rd Qu.: 6.650
## Max. :119172.74 Max. :1.294e+09 Max. :26.800
## thinness_5-9_years Income_composition_of_resources Schooling
## Min. : 0.100 Min. :0.3450 Min. : 5.30
## 1st Qu.: 1.550 1st Qu.:0.5440 1st Qu.:10.75
## Median : 3.500 Median :0.6970 Median :12.70
## Mean : 4.886 Mean :0.6697 Mean :12.68
## 3rd Qu.: 6.800 3rd Qu.:0.7790 3rd Qu.:14.70
## Max. :27.400 Max. :0.9360 Max. :20.40
range(life$Life_expectancy)
## [1] 48.1 89.0
The data set has 22 variables: 20 numerical and 2 categorical. Variables we should drop: Country and Year. Hepatitis_B has a gap of 76 between its minimum and its 1st quartile, which is too large (convert to a factor?). Polio has a gap of 70 between its minimum and its 1st quartile, which is too large (convert to a factor?). Diphtheria has a gap of 78 between its minimum and its 1st quartile, which is too large (convert to a factor?).
According to the World Health Organization, in 2018 about 86% of the world's children were receiving immunizations protecting them from these diseases. Source: https://www.chop.edu/centers-programs/vaccine-education-center/global-immunization/diseases-and-vaccines-world-view Let us use 86% as the benchmark to turn these columns into factors.
# Drop the identifier columns, then turn the three immunization coverage
# percentages into two-level factors split at the 86% benchmark.
life_new <- life %>%
  select(-Country, -Year) %>%
  mutate(
    across(
      c(Hepatitis_B, Polio, Diphtheria),
      function(coverage) {
        as.factor(ifelse(coverage < 86, "<86% Immunized", ">=86% Immunized"))
      }
    )
  )
str(life_new)
## tibble [131 x 20] (S3: tbl_df/tbl/data.frame)
## $ Status : chr [1:131] "Developing" "Developing" "Developing" "Developing" ...
## $ Life_expectancy : num [1:131] 59.9 77.5 75.4 51.7 76.2 74.6 82.7 81.4 72.5 71.4 ...
## $ Adult_Mortality : num [1:131] 271 8 11 348 118 12 6 66 119 132 ...
## $ infant_deaths : num [1:131] 64 0 21 67 8 1 1 0 5 98 ...
## $ Alcohol : num [1:131] 0.01 4.51 0.01 8.33 7.93 ...
## $ percentage_expenditure : num [1:131] 73.5 428.7 54.2 24 847.4 ...
## $ Hepatitis_B : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ Measles : num [1:131] 492 0 0 11699 1 ...
## $ BMI : num [1:131] 18.6 57.2 58.4 22.7 62.2 54.1 66.1 57.1 51.5 17.7 ...
## $ under-five_deaths : num [1:131] 86 1 24 101 9 1 1 0 6 121 ...
## $ Polio : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ Total_expenditure : num [1:131] 8.18 5.88 7.21 3.31 4.79 ...
## $ Diphtheria : Factor w/ 2 levels "<86% Immunized",..: 1 2 2 1 2 2 2 2 2 2 ...
## $ HIV/AIDS : num [1:131] 0.1 0.1 0.1 2 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num [1:131] 613 4576 548 479 12245 ...
## $ Population : num [1:131] 327582 288914 39113313 2692466 42981515 ...
## $ thinness__1-19_years : num [1:131] 17.5 1.2 6 8.5 1 2.1 0.6 1.8 2.8 18.1 ...
## $ thinness_5-9_years : num [1:131] 17.5 1.3 5.8 8.3 0.9 2.1 0.6 2 2.9 18.6 ...
## $ Income_composition_of_resources: num [1:131] 0.476 0.761 0.741 0.527 0.825 0.739 0.936 0.892 0.752 0.57 ...
## $ Schooling : num [1:131] 10 14.2 14.4 11.4 17.3 12.7 20.4 15.9 12.2 10 ...
## - attr(*, "spec")=
## .. cols(
## .. Country = col_character(),
## .. Year = col_double(),
## .. Status = col_character(),
## .. `Life expectancy` = col_double(),
## .. `Adult Mortality` = col_double(),
## .. `infant deaths` = col_double(),
## .. Alcohol = col_double(),
## .. `percentage expenditure` = col_double(),
## .. `Hepatitis B` = col_double(),
## .. Measles = col_double(),
## .. BMI = col_double(),
## .. `under-five deaths` = col_double(),
## .. Polio = col_double(),
## .. `Total expenditure` = col_double(),
## .. Diphtheria = col_double(),
## .. `HIV/AIDS` = col_double(),
## .. GDP = col_double(),
## .. Population = col_double(),
## .. `thinness 1-19 years` = col_double(),
## .. `thinness 5-9 years` = col_double(),
## .. `Income composition of resources` = col_double(),
## .. Schooling = col_double()
## .. )
Check the correlation of the numerical variables
# Keep only the numeric columns of life_new.
life_numerical <- life_new %>%
  select(where(is.numeric))

# Labelled correlation plot of the numeric variables.
# label = TRUE is spelled out because the shorthand T is an ordinary variable
# that can be reassigned.
ggcorr(
  life_numerical,
  label = TRUE,
  label_size = 2,
  label_round = 2,
  hjust = 1,
  size = 3,
  color = "black",
  layout.exp = 5,
  low = "forestgreen",
  mid = "gray95",
  high = "darkorange",
  name = "Correlation"
)
Life_expectancy has a strong positive correlation with Schooling and Income_composition_of_resources.
Life_expectancy has a strong negative correlation with Adult_Mortality, which is understandable: if the mortality rate among adults is high, life expectancy will be lower.
Population and Measles show almost no correlation with Life_expectancy. infant_deaths and under-five_deaths have a correlation of 1 (100%), which indicates multicollinearity between them. We should drop one of the two variables, and I believe under-five_deaths should be the one dropped.
# Drop under-five_deaths by name rather than by column position: a positional
# index silently removes a different column if the column order changes or if
# this line is run a second time. any_of() makes the step a no-op when the
# column has already been removed.
life_new <- life_new %>%
  select(-any_of("under-five_deaths"))
##Check the distribution of the categorical variables
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Status count percentage
## <chr> <int> <chr>
## 1 Developed 19 14.5%
## 2 Developing 112 85.5%
According to the boxplots, higher life expectancy is concentrated in the developed countries; the plot even shows the median (2nd quartile) for developed countries being skewed upward towards the 3rd quartile.
We want to know if there is any significant difference between the average life expectancy in Developed and Developing countries.
# One-way ANOVA: does mean Life_expectancy differ between the Status groups?
summary(aov(Life_expectancy ~ Status, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Status 1 2453 2453.1 44.12 7.83e-10 ***
## Residuals 129 7173 55.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test tells us that there is a significant difference between the life expectancy of Developed countries and the life expectancy of Developing countries.
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Hepatitis_B count percentage
## <fct> <int> <chr>
## 1 <86% Immunized 44 33.59%
## 2 >=86% Immunized 87 66.41%
About one third of the countries have less than 86% immunized for Hepatitis B. The life expectancy of the countries with greater than or equal to 86% immunized is higher than that of the countries with less than 86% immunized. Note that the median (2nd quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.
We want to know if there is any significant difference between the average life expectancy of countries with at least 86% immunized for Hepatitis B and that of countries with less than 86% immunized.
# One-way ANOVA: does mean Life_expectancy differ between the two Hepatitis_B
# immunization groups?
summary(aov(Life_expectancy ~ Hepatitis_B, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Hepatitis_B 1 1088 1087.6 16.43 8.67e-05 ***
## Residuals 129 8539 66.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA test tells us that there is a significant difference between the life expectancy of countries that immunized >=86% and the life expectancy of countries that immunized less than 86% for Hepatitis B.
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
## Polio count percentage
## <fct> <int> <chr>
## 1 <86% Immunized 46 35.11%
## 2 >=86% Immunized 85 64.89%
For Polio Immunizations, higher life expectancy resides with the countries that have >=86% Immunized. Note that the median (2nd Quartile) for >=86% immunized is skewed upwards towards the 3rd quartile.
# One-way ANOVA: does mean Life_expectancy differ between the two Polio
# immunization groups?
summary(aov(Life_expectancy ~ Polio, data = life_new))
## Df Sum Sq Mean Sq F value Pr(>F)
## Polio 1 2295 2294.8 40.38 3.3e-09 ***
## Residuals 129 7332 56.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the ANOVA test, there is a significant difference between the life expectancy of countries that immunized >=86% and the life expectancy of countries that immunized less than 86% for Polio.